#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(expss)
library(polycor)
library(foreach) ; library(doParallel)
library(knitr)
library(biomaRt)
library(anRichment) ; library(BrainDiseaseCollection)
suppressWarnings(suppressMessages(library(WGCNA)))

Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_21_WGCNA.Rmd)

# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame


# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Clusterings
clusterings = read_csv('./../Data/clusters.csv')


# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
  mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
  left_join(GO_neuronal, by='ID') %>% left_join(clusterings, by='ID') %>%
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`), 
         significant=padj<0.05 & !is.na(padj))

# Add gene symbol
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=genes_info$ID, mart=mart)

genes_info = genes_info %>% left_join(gene_names, by=c('ID'='ensembl_gene_id'))


clustering_selected = 'DynamicHybrid'
genes_info$Module = genes_info[,clustering_selected]

dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]

# Correct SFARI Scores
dataset$gene.score = genes_info$gene.score

# Enrichment Analysis
load('./../Data/enrichmentAnalysis.RData')


SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

rm(DE_info, GO_annotations, clusterings, getinfo, mart, dds, GO_neuronal)

Calculate the Enrichment of SFARI Genes in Modules


To measure the presence of SFARI Genes in a module, at first we were just measuring the percentage of genes in the module that belonge to SFARI, but this approach does not take into account the size of the module, and because of this it doesn’t take into consideration the robustness of the results (it’s easier to get a high percentage of SFARI genes in a small module by chance than in a larger module). Because of this, we chose to use the following approach:

If we assume independence between SFARI Genes and modules, we can calculate the probability of obtaining a proportion of SFARI Genes at least as big as the one found in each module given its size


Notation:


If we interpret the number of genes (\(n\)) in a module as \(n\) random draws without replacement from a finite population of size \(N\), and the number of SFARI genes in the module (\(k\)) as \(k\) successes in those \(n\) draws, where we know that \(N\) contains exactly \(K\) successes, then we can use the Hypergeometric Distribution to calculate the statistical significance of having drawn \(k\) successes out of \(n\) draws, and use this value to select the clusters with the highest enrichment



Relation between SFARI Enrichment and Module-Diagnosis correlation


  • For Modules with negative correlation to ASD there doesn’t seem to be a relation between this correlation and the enrichment of the SFARI Genes of the module, but for Modules with positive correlation, the higher the correlation, the smaller the enrichment of SFARI Genes

  • SFARI genes seem to be strongly concentrated in a group of modules and almost abstent in others (many modules with a probability close to 1 and many with a probability close to 0, with not so many in between)

  • The size of the modules does play a part in the probabilities, with the smallest modules having on average less extreme probabilities than the rest of the modules

SFARI_genes_by_module = dataset %>% mutate('hasSFARIscore' = !gene.score %in% c('None', 'Neuronal')) %>% 
                        group_by(Module, MTcor, hasSFARIscore) %>% summarise(s=n()) %>% 
                        left_join(dataset %>% group_by(Module) %>% summarise(n=n()), by='Module') %>% 
                        mutate(perc=round(s/n*100,2)) %>% filter(hasSFARIscore & Module != 'gray') %>% arrange(desc(perc)) %>% ungroup

N = sum(SFARI_genes_by_module$n)
S = sum(SFARI_genes_by_module$s)

calc_prob = function(row, log.p){
  s = row[4] %>% as.numeric
  n = row[5] %>% as.numeric
  prob = phyper(s,S,N-S,n, log.p = log.p, lower.tail=FALSE)
  return(prob)
}

SFARI_genes_by_module$prob = apply(SFARI_genes_by_module, 1, function(x) calc_prob(x, FALSE))
SFARI_genes_by_module$adj_prob = p.adjust(SFARI_genes_by_module$prob, method = 'bonferroni')
SFARI_genes_by_module = SFARI_genes_by_module %>% arrange(prob)

ggplotly(SFARI_genes_by_module %>% ggplot(aes(MTcor, prob, size=n)) + geom_point(color=SFARI_genes_by_module$Module, alpha=0.5, aes(id=Module)) + 
         geom_smooth(color='#cccccc', size = 0.5, se=FALSE) + xlab('Module-Diagnosis Correlation') + ylab('Probability') + 
         ggtitle(paste0('Corr = ', round(cor(SFARI_genes_by_module$MTcor, SFARI_genes_by_module$prob),2), ': Corr[Module-ASD corr<0] = ', 
                        round(cor(SFARI_genes_by_module$MTcor[SFARI_genes_by_module$MTcor<0], SFARI_genes_by_module$prob[SFARI_genes_by_module$MTcor<0]),3),
                        ' Corr[Module-ASD corr>0] = ',
                        round(cor(SFARI_genes_by_module$MTcor[SFARI_genes_by_module$MTcor>=0], SFARI_genes_by_module$prob[SFARI_genes_by_module$MTcor>=0]),2))) +
         theme_minimal() + theme(legend.position = 'none'))

It’s weird that the Module with the highest (positive) correlation to ASD have less enrichment in SFARI Genes than the rest of the Modules. This seems to be because =eEven though SFARI Genes are quite balanced between over-and under-expressed gemes, they have lower lFC values than the rest of the genes in the over-expressed group

# !diagnostics off
plot_data = genes_info %>% mutate(label = ifelse(!gene.score %in% c('Neuronal', 'None'), 'SFARI', gene.score)) %>% dplyr::select(label, log2FoldChange) %>%
            mutate(class = factor(label, levels = c('None', 'Neuronal', 'SFARI')),
                   quant = cut(log2FoldChange, breaks = quantile(log2FoldChange, probs = seq(0,1,0.05)) %>% as.vector, labels = FALSE)) %>% 
            filter(label == 'SFARI') %>% group_by(quant) %>% tally %>% ungroup

ggplotly(genes_info %>% mutate(direction = factor(ifelse(log2FoldChange<0, 'under-expressed', 'over-expressed'), levels = c('under-expressed', 'over-expressed')),
                               label = factor(ifelse(!gene.score %in% c('Neuronal', 'None'), 'SFARI', gene.score), levels = c('None', 'Neuronal', 'SFARI'))) %>% 
         ggplot(aes(x=direction, fill = label)) + geom_bar(position = 'fill') + ggtitle('Proportion of SFARI Genes for under- and over-expressed genes') +
         scale_fill_manual(values = c('#b3b3b3','#808080','#ff6600')) + ylab('Proportion') + xlab('Direction') + scale_y_sqrt() + theme_minimal())
ggplotly(plot_data %>% ggplot(aes(quant, n)) + geom_bar(stat = 'identity', fill = '#ff6600') + geom_smooth(color = 'gray', alpha = 0.3) + 
         xlab('Log Fold Change Quantiles') + ylab('Number of SFARI Genes') + ggtitle('Number of SFARI Genes for lFC Quantiles') + theme_minimal())



Selecting Top Modules Enriched in SFARI Genes


We can interpet the probability we obtain as a p-value (and correct it for multiple testing), we can use it as a threshold to identify modules with a significantly high percentage of SFARI genes (adjusted p-value < 0.01)

Using log-scale to help us differentiate between small differences close to zero better

ggplotly(SFARI_genes_by_module %>% ggplot(aes(MTcor, adj_prob, size=n)) + geom_point(color=SFARI_genes_by_module$Module, alpha=0.5, aes(id=Module)) +
         geom_hline(yintercept = 0.05, color = 'gray', linetype = 'dotted') + xlab('Module-Diagnosis Correlation') + ylab('Corrected p-values') + scale_y_log10() +
         theme_minimal() + theme(legend.position = 'none'))
top_modules = SFARI_genes_by_module$Module[SFARI_genes_by_module$adj_prob <0.05]

cat(paste0('Keeping top ', length(top_modules),' modules: ', paste(top_modules, collapse = ', ')))
## Keeping top 6 modules: #B79F00, #FE61CF, #00A8FF, #619CFF, #8195FF, #00B9E3
rm(N,S)

Exploratory Analysis of Top Modules


PCA


The genes belonging to the modules enriched in SARI genes seem to be distributed in all of the PC space except for the highest values of PC2, which we know correspond to over-expressed genes. This agrees with the positive slope at the end of the trend line in the plot above

pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID') %>% left_join(genes_info %>% dplyr::select(ID, external_gene_id), by='ID') %>%
            dplyr::select(ID, external_gene_id, PC1, PC2, Module, gene.score) %>%
            mutate(ImportantModules = ifelse(Module %in% top_modules, as.character(Module), 'Others')) %>%
            mutate(color = ifelse(ImportantModules=='Others','gray',ImportantModules),
                   alpha = ifelse(ImportantModules=='Others', 0.1, 0.3),
                   gene_id = paste0(ID, ' (', external_gene_id, ')'))

table(plot_data$ImportantModules)
## 
## #00A8FF #00B9E3 #619CFF #8195FF #B79F00 #FE61CF  Others 
##     270    1733     459      40     174     315   13156
p = plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) + geom_point(alpha=plot_data$alpha) + 
    scale_colour_manual(values =  c(names(table(plot_data$ImportantModules))[-(length(top_modules)+1)],'gray')) + 
    theme_minimal() + theme(legend.position = 'none') + ggtitle('Modules with the most significant presence of SFARI Genes')

for(tm in top_modules){
  p = p + geom_hline(yintercept = mean(plot_data$PC2[plot_data$Module==tm]), color = tm, linetype = 'dashed') + 
          geom_vline(xintercept = mean(plot_data$PC1[plot_data$Module==tm]), color = tm, linetype = 'dashed')
}

ggExtra::ggMarginal(p, type='density', groupColour = TRUE, size=10)


Functional Enrichment Analysis

i = 1
kable(enrichment$enrichmentTable %>% filter(class==top_modules[i]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR, enrichmentRatio, effectiveClassSize, effectiveSetSize, nCommonGenes) %>%
      arrange(Bonferroni, desc(enrichmentRatio)),
      caption = paste0('Enriched terms for module ', top_modules[i], ' (SFARI Genes = ',
                       round(SFARI_genes_by_module$perc[SFARI_genes_by_module$Module==top_modules[i]][1],4), '%)'))
Enriched terms for module #B79F00 (SFARI Genes = 16.09%)
dataSetID shortDataSetName inGroups Bonferroni FDR enrichmentRatio effectiveClassSize effectiveSetSize nCommonGenes
JAMiller.AIBS.000569 WGCNA humanSpecificOlivedrab2Module frontalCtx FOXP2 JA Miller at AIBS|Brain|Postnatal brain|Cortex|WGCNA 0.0008823 0.0000011 1.850606 168 4036 80
GO:0051252 regulation of RNA metabolic process GO|GO.BP 0.2041583 0.0001640 1.881450 168 3027 61
GO:0090304 nucleic acid metabolic process GO|GO.BP 0.2908464 0.0002255 1.670297 168 4304 77
GO:2000112 regulation of cellular macromolecule biosynthetic process GO|GO.BP 0.2970930 0.0002299 1.847594 168 3133 62
GO:0019219 regulation of nucleobase-containing compound metabolic process GO|GO.BP 0.3413090 0.0002605 1.812876 168 3296 64
GO:0010556 regulation of macromolecule biosynthetic process GO|GO.BP 1.0000000 0.0007780 1.778891 168 3254 62
GO:0031326 regulation of cellular biosynthetic process GO|GO.BP 1.0000000 0.0007603 1.754842 168 3405 64
GO:0009889 regulation of biosynthetic process GO|GO.BP 1.0000000 0.0006901 1.748373 168 3471 65
GO:0016070 RNA metabolic process GO|GO.BP 1.0000000 0.0010016 1.683317 168 3827 69
GO:0005634 nucleus GO|GO.CC 1.0000000 0.0013611 1.475657 168 5884 93
i = 2
kable(enrichment$enrichmentTable %>% filter(class==top_modules[i]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR, enrichmentRatio, effectiveClassSize, effectiveSetSize, nCommonGenes) %>%
      arrange(Bonferroni, desc(enrichmentRatio)),
      caption = paste0('Enriched terms for module ', top_modules[i], ' (SFARI Genes = ',
                       round(SFARI_genes_by_module$perc[SFARI_genes_by_module$Module==top_modules[i]][1],4), '%)'))
Enriched terms for module #FE61CF (SFARI Genes = 12.06%)
dataSetID shortDataSetName inGroups Bonferroni FDR enrichmentRatio effectiveClassSize effectiveSetSize nCommonGenes
JAMiller.AIBS.000188 CortexWGCNA midfetal M18 JA Miller at AIBS|Brain|Prenatal brain|WGCNA 0.0000000 0.0000000 7.058003 313 426 60
JAMiller.AIBS.000195 RegionalWGCNA midfetal M25 JA Miller at AIBS|Brain|Prenatal brain|WGCNA 0.0000000 0.0000000 8.176139 313 190 31
JAMiller.AIBS.000048 CortexWGCNA 15-21 post-conception weeks C22 CPenriched enrichedForAutismGenes JA Miller at AIBS|Brain|Prenatal brain|Cortex|WGCNA 0.0000000 0.0000000 3.962714 313 607 48
JAMiller.AIBS.000044 CortexWGCNA 15-21 post-conception weeks C18 JA Miller at AIBS|Brain|Prenatal brain|Cortex|WGCNA 0.0000097 0.0000000 3.140713 313 718 45
JAMiller.AIBS.000268 Genes bound by EGR1 in HUMAN ERYTHROLEUKEMIA from PMID 20690147 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.0001385 0.0000002 1.556912 313 4828 150
JAMiller.AIBS.000155 Lowest in VZ of E14.5 mouse JA Miller at AIBS|Brain|Prenatal brain|Markers of cortex layers|Cortex 0.0004462 0.0000006 2.138185 313 1664 71
JAMiller.AIBS.000569 WGCNA humanSpecificOlivedrab2Module frontalCtx FOXP2 JA Miller at AIBS|Brain|Postnatal brain|Cortex|WGCNA 0.0006891 0.0000009 1.614107 313 4036 130
JAMiller.AIBS.000257 Genes bound by DMRT1 in MOUSE TESTES from PMID 23473982 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.0747682 0.0000662 1.969328 313 1654 65
JAMiller.AIBS.000245 Genes bound by CREB1 in RAT HIPPOCAMPUS from PMID 23762244 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.1714689 0.0001401 1.823144 313 2034 74
JAMiller.AIBS.000124 HippocampusWGCNA yellow JA Miller at AIBS|Brain|Postnatal brain|WGCNA 0.3129563 0.0002413 2.590714 313 677 35
i = 3
kable(enrichment$enrichmentTable %>% filter(class==top_modules[i]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR, enrichmentRatio, effectiveClassSize, effectiveSetSize, nCommonGenes) %>%
      arrange(Bonferroni, desc(enrichmentRatio)),
      caption = paste0('Enriched terms for module ', top_modules[i], ' (SFARI Genes = ',
                       round(SFARI_genes_by_module$perc[SFARI_genes_by_module$Module==top_modules[i]][1],4), '%)'))
Enriched terms for module #00A8FF (SFARI Genes = 11.85%)
dataSetID shortDataSetName inGroups Bonferroni FDR enrichmentRatio effectiveClassSize effectiveSetSize nCommonGenes
JAMiller.AIBS.000188 CortexWGCNA midfetal M18 JA Miller at AIBS|Brain|Prenatal brain|WGCNA 0.0007389 0.0000010 3.949958 261 426 28
JAMiller.AIBS.000349 Genes bound by KDM5B in MOUSE MESC from PMID 21448134 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.0017535 0.0000021 1.824771 261 2964 90
JAMiller.AIBS.000161 Genes increasing with age in fetal PFC JA Miller at AIBS|Brain|Prenatal brain|Age-associated genes|Cortex 1.0000000 0.0229150 2.258634 261 745 28
JAMiller.AIBS.000078 Cerebellar granule cells JA Miller at AIBS|Brain|Postnatal brain|Cell type markers 1.0000000 0.0197433 1.827103 261 1513 46
GO:2000112 regulation of cellular macromolecule biosynthetic process GO|GO.BP 1.0000000 0.0207200 1.515342 261 3133 79
GO:0010556 regulation of macromolecule biosynthetic process GO|GO.BP 1.0000000 0.0150683 1.514399 261 3254 82
JAMiller.AIBS.000569 WGCNA humanSpecificOlivedrab2Module frontalCtx FOXP2 JA Miller at AIBS|Brain|Postnatal brain|Cortex|WGCNA 1.0000000 0.0030512 1.488994 261 4036 100
GO:0009889 regulation of biosynthetic process GO|GO.BP 1.0000000 0.0260383 1.471663 261 3471 85
JAMiller.AIBS.000268 Genes bound by EGR1 in HUMAN ERYTHROLEUKEMIA from PMID 20690147 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 1.0000000 0.0269462 1.369208 261 4828 110
GO:0050789 regulation of biological process GO|GO.BP 1.0000000 0.0224722 1.207686 261 8957 180
i = 4
kable(enrichment$enrichmentTable %>% filter(class==top_modules[i]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR, enrichmentRatio, effectiveClassSize, effectiveSetSize, nCommonGenes) %>%
      arrange(Bonferroni, desc(enrichmentRatio)),
      caption = paste0('Enriched terms for module ', top_modules[i], ' (SFARI Genes = ',
                       round(SFARI_genes_by_module$perc[SFARI_genes_by_module$Module==top_modules[i]][1],4), '%)'))
Enriched terms for module #619CFF (SFARI Genes = 9.8%)
dataSetID shortDataSetName inGroups Bonferroni FDR enrichmentRatio effectiveClassSize effectiveSetSize nCommonGenes
GO:0090304 nucleic acid metabolic process GO|GO.BP 0.00e+00 0e+00 1.753458 451 4304 217
GO:0016070 RNA metabolic process GO|GO.BP 0.00e+00 0e+00 1.817521 451 3827 200
GO:0010467 gene expression GO|GO.BP 0.00e+00 0e+00 1.678789 451 4454 215
GO:0005634 nucleus GO|GO.CC 0.00e+00 0e+00 1.536769 451 5884 260
GO:0003676 nucleic acid binding GO|GO.MF 0.00e+00 0e+00 1.839548 451 3214 170
GO:0006139 nucleobase-containing compound metabolic process GO|GO.BP 0.00e+00 0e+00 1.610303 451 4881 226
GO:0046483 heterocycle metabolic process GO|GO.BP 0.00e+00 0e+00 1.578302 451 5002 227
GO:0006725 cellular aromatic compound metabolic process GO|GO.BP 0.00e+00 0e+00 1.567335 451 5037 227
GO:0051252 regulation of RNA metabolic process GO|GO.BP 0.00e+00 0e+00 1.826807 451 3027 159
GO:0031981 nuclear lumen GO|GO.CC 0.00e+00 0e+00 1.709029 451 3724 183
GO:0019219 regulation of nucleobase-containing compound metabolic process GO|GO.BP 0.00e+00 0e+00 1.772679 451 3296 168
JAMiller.AIBS.000349 Genes bound by KDM5B in MOUSE MESC from PMID 21448134 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.00e+00 0e+00 1.830435 451 2964 156
GO:0034641 cellular nitrogen compound metabolic process GO|GO.BP 0.00e+00 0e+00 1.507378 451 5445 236
GO:0044428 nuclear part GO|GO.CC 0.00e+00 0e+00 1.637542 451 4014 189
GO:1901360 organic cyclic compound metabolic process GO|GO.BP 0.00e+00 0e+00 1.518497 451 5199 227
GO:0010468 regulation of gene expression GO|GO.BP 0.00e+00 0e+00 1.683595 451 3615 175
GO:0005654 nucleoplasm GO|GO.CC 2.00e-07 0e+00 1.723000 451 3169 157
GO:0006355 regulation of transcription, DNA-templated GO|GO.BP 3.00e-07 0e+00 1.785435 451 2766 142
GO:0006351 transcription, DNA-templated GO|GO.BP 8.00e-07 0e+00 1.740690 451 2937 147
GO:1903506 regulation of nucleic acid-templated transcription GO|GO.BP 9.00e-07 0e+00 1.759835 451 2826 143
GO:0032774 RNA biosynthetic process GO|GO.BP 9.00e-07 0e+00 1.729050 451 2997 149
GO:2001141 regulation of RNA biosynthetic process GO|GO.BP 1.10e-06 0e+00 1.754248 451 2835 143
JAMiller.AIBS.000209 RegionalWGCNA midfetal M39 AlternativeSplicing JA Miller at AIBS|Brain|Prenatal brain|WGCNA 1.20e-06 0e+00 3.058089 451 580 51
GO:0097659 nucleic acid-templated transcription GO|GO.BP 2.80e-06 0e+00 1.714422 451 2982 147
GO:2000112 regulation of cellular macromolecule biosynthetic process GO|GO.BP 3.50e-06 0e+00 1.687296 451 3133 152
GO:0031326 regulation of cellular biosynthetic process GO|GO.BP 4.60e-06 0e+00 1.644435 451 3405 161
GO:1901363 heterocyclic compound binding GO|GO.MF 8.00e-06 0e+00 1.489880 451 4832 207
GO:0003677 DNA binding GO|GO.MF 1.12e-05 0e+00 1.947733 451 1857 104
GO:0009889 regulation of biosynthetic process GO|GO.BP 1.12e-05 0e+00 1.623186 451 3471 162
GO:0010556 regulation of macromolecule biosynthetic process GO|GO.BP 1.72e-05 0e+00 1.645929 451 3254 154
GO:0097159 organic cyclic compound binding GO|GO.MF 2.51e-05 0e+00 1.473414 451 4886 207
GO:0034654 nucleobase-containing compound biosynthetic process GO|GO.BP 2.51e-05 0e+00 1.611310 451 3475 161
GO:0060255 regulation of macromolecule metabolic process GO|GO.BP 2.73e-05 0e+00 1.469703 451 4922 208
GO:0051171 regulation of nitrogen compound metabolic process GO|GO.BP 3.59e-05 1e-07 1.485616 451 4682 200
GO:0044451 nucleoplasm part GO|GO.CC 3.83e-05 1e-07 2.388518 451 961 66
GO:0043170 macromolecule metabolic process GO|GO.BP 3.99e-05 1e-07 1.315338 451 7562 286
GO:0018130 heterocycle biosynthetic process GO|GO.BP 4.45e-05 1e-07 1.596509 451 3529 162
GO:0019438 aromatic compound biosynthetic process GO|GO.BP 5.00e-05 1e-07 1.594250 451 3534 162
i = 5
kable(enrichment$enrichmentTable %>% filter(class==top_modules[i]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR, enrichmentRatio, effectiveClassSize, effectiveSetSize, nCommonGenes) %>%
      arrange(Bonferroni, desc(enrichmentRatio)),
      caption = paste0('Enriched terms for module ', top_modules[i], ' (SFARI Genes = ',
                       round(SFARI_genes_by_module$perc[SFARI_genes_by_module$Module==top_modules[i]][1],4), '%)'))
Enriched terms for module #8195FF (SFARI Genes = 17.5%)
dataSetID shortDataSetName inGroups Bonferroni FDR enrichmentRatio effectiveClassSize effectiveSetSize nCommonGenes
JAMiller.AIBS.000349 Genes bound by KDM5B in MOUSE MESC from PMID 21448134 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.2506351 0.0001972 2.985138 39 2964 22
JAMiller.AIBS.000250 Genes bound by CTNNB1 in HUMAN HCT116 from PMID 20460455 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 0.6636308 0.0004740 5.738590 39 841 12
GO:0048712 negative regulation of astrocyte differentiation GO|GO.BP 1.0000000 0.1031902 73.123543 39 11 2
GO:0060840 artery development GO|GO.BP 1.0000000 0.0221715 19.860715 39 81 4
JAMiller.AIBS.000111 HippocampusWGCNA green JA Miller at AIBS|Brain|Postnatal brain|WGCNA 1.0000000 0.0544297 5.939360 39 474 7
JAMiller.AIBS.000052 CortexWGCNA 15-21 post-conception weeks C26 JA Miller at AIBS|Brain|Prenatal brain|Cortex|WGCNA 1.0000000 0.0244813 5.020271 39 721 9
JAMiller.AIBS.000509 Genes bound by TAL1 in MOUSE PRIMARY FETAL LIVER ERYTHROID CELLS from PMID 20566737 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 1.0000000 0.0535195 3.312391 39 1457 12
JAMiller.AIBS.000245 Genes bound by CREB1 in RAT HIPPOCAMPUS from PMID 23762244 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 1.0000000 0.0250917 2.965925 39 2034 15
JAMiller.AIBS.000554 Genes bound by YAP1 in MOUSE MESC from PMID 20516196 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 1.0000000 0.0882643 2.896584 39 1805 13
JAMiller.AIBS.000569 WGCNA humanSpecificOlivedrab2Module frontalCtx FOXP2 JA Miller at AIBS|Brain|Postnatal brain|Cortex|WGCNA 1.0000000 0.0213181 2.192257 39 4036 22
i = 6
kable(enrichment$enrichmentTable %>% filter(class==top_modules[i]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR, enrichmentRatio, effectiveClassSize, effectiveSetSize, nCommonGenes) %>%
      arrange(Bonferroni, desc(enrichmentRatio)),
      caption = paste0('Enriched terms for module ', top_modules[i], ' (SFARI Genes = ',
                       round(SFARI_genes_by_module$perc[SFARI_genes_by_module$Module==top_modules[i]][1],4), '%)'))
Enriched terms for module #00B9E3 (SFARI Genes = 6.46%)
dataSetID shortDataSetName inGroups Bonferroni FDR enrichmentRatio effectiveClassSize effectiveSetSize nCommonGenes
JAMiller.AIBS.000052 CortexWGCNA 15-21 post-conception weeks C26 JA Miller at AIBS|Brain|Prenatal brain|Cortex|WGCNA 0.00e+00 0e+00 2.708083 1695 721 211
JAMiller.AIBS.000142 Highest in CP of 13-16 post-conception weeks human JA Miller at AIBS|Brain|Prenatal brain|Markers of cortex layers|Cortex|Postmitotic brain 0.00e+00 0e+00 2.194883 1695 1210 287
JAMiller.AIBS.000569 WGCNA humanSpecificOlivedrab2Module frontalCtx FOXP2 JA Miller at AIBS|Brain|Postnatal brain|Cortex|WGCNA 0.00e+00 0e+00 1.524703 1695 4036 665
JAM:002769 downAD_mitochondrion JAM|BrainLists|BrainLists.Blalock_AD 0.00e+00 0e+00 3.364977 1695 264 96
JAMiller.AIBS.000150 Highest in CP of E14.5 mouse JA Miller at AIBS|Brain|Prenatal brain|Markers of cortex layers|Cortex|Postmitotic brain 0.00e+00 0e+00 1.878513 1695 1266 257
JAM:003016 downAD_synapticTransmission JAM|BrainLists|BrainLists.Blalock_AD 0.00e+00 0e+00 5.152621 1695 88 49
JAMiller.AIBS.000155 Lowest in VZ of E14.5 mouse JA Miller at AIBS|Brain|Prenatal brain|Markers of cortex layers|Cortex 0.00e+00 0e+00 1.723944 1695 1664 310
JAMiller.AIBS.000123 HippocampusWGCNA turquoise DGenriched upAge JA Miller at AIBS|Brain|Postnatal brain|WGCNA 0.00e+00 0e+00 1.921531 1695 1098 228
JAMiller.AIBS.000570 WGCNA Olivedrab2ModuleGenes with enriched ELAVL2 targets JA Miller at AIBS|Brain|Postnatal brain|Cortex|WGCNA 0.00e+00 0e+00 2.481758 1695 481 129
JAM:003072 Tail of Caudate Nucleus_IN_Striatum JAM|BrainRegionMarkers|BrainRegionMarkers.HBA|BrainRegionMarkers.HBA.localMarker(top200)|Brain region markers|Brain region marker enriched gene sets 0.00e+00 0e+00 3.506055 1695 161 61
GO:0044456 synapse part GO|GO.CC 0.00e+00 0e+00 1.967674 1695 823 175
GO:0045202 synapse GO|GO.CC 0.00e+00 0e+00 1.834783 1695 1044 207
GO:0097060 synaptic membrane GO|GO.CC 0.00e+00 0e+00 2.418297 1695 375 98
JAMiller.AIBS.000141 CP enriched in E14.5 mouse JA Miller at AIBS|Brain|Prenatal brain|Markers of cortex layers|Cortex|Postmitotic brain 0.00e+00 0e+00 2.112789 1695 565 129
GO:0097458 neuron part GO|GO.CC 0.00e+00 0e+00 1.651046 1695 1418 253
JAM:002764 downAging_mitochondria_synapse JAM|BrainLists|BrainLists.Lu_Aging 0.00e+00 0e+00 2.349013 1695 390 99
JAM:002751 Basal Pons JAM|BrainRegionMarkers|BrainRegionMarkers.HBA|BrainRegionMarkers.HBA.globalMarker(top200)|Brain region markers|Brain region marker enriched gene sets 0.00e+00 0e+00 3.196728 1695 165 57
JAM:002744 Autism_differential_expression_across_at_least_one_comparison JAM|BrainLists|BrainLists.Voineagu 0.00e+00 0e+00 1.855589 1695 763 153
JAM:003054 subiculum_IN_Hippocampal Formation JAM|BrainRegionMarkers|BrainRegionMarkers.HBA|BrainRegionMarkers.HBA.localMarker(top200)|Brain region markers|Brain region marker enriched gene sets 0.00e+00 0e+00 3.065639 1695 163 54
JAMiller.AIBS.000005 CPi markers at 21 post-conception weeks JA Miller at AIBS|Brain|Prenatal brain|Cortex|Markers of cortex layers|Postmitotic brain 1.00e-07 0e+00 2.395777 1695 309 80
JAM:002805 Cerebral Cortex JAM|BrainRegionMarkers|BrainRegionMarkers.HBA|BrainRegionMarkers.HBA.globalMarker(top200)|Brain region markers|Brain region marker enriched gene sets 2.00e-07 0e+00 2.988769 1695 161 52
JAM:002739 arcuate nucleus of medulla_IN_Myelencephalon JAM|BrainRegionMarkers|BrainRegionMarkers.HBA|BrainRegionMarkers.HBA.localMarker(top200)|Brain region markers|Brain region marker enriched gene sets 2.00e-07 0e+00 2.936799 1695 167 53
JAM:002920 Lateral Nucleus_IN_Amygdala JAM|BrainRegionMarkers|BrainRegionMarkers.HBA|BrainRegionMarkers.HBA.localMarker(top200)|Brain region markers|Brain region marker enriched gene sets 5.00e-07 0e+00 2.916314 1695 165 52
GO:0099536 synaptic signaling GO|GO.BP 5.00e-07 0e+00 1.949884 1695 560 118
GO:0034220 ion transmembrane transport GO|GO.BP 7.00e-07 0e+00 1.720897 1695 898 167
GO:0098793 presynapse GO|GO.CC 1.00e-06 0e+00 2.108154 1695 417 95
JAMiller.AIBS.000095 Cortical PNOC neurons JA Miller at AIBS|Brain|Postnatal brain|Cell type markers|Cortex 2.50e-06 0e+00 1.282364 1695 3940 546
GO:0045211 postsynaptic membrane GO|GO.CC 3.10e-06 0e+00 2.346005 1695 284 72
GO:0099537 trans-synaptic signaling GO|GO.BP 3.40e-06 0e+00 1.917431 1695 555 115
JAM:002882 Hippocampal Formation JAM|BrainRegionMarkers|BrainRegionMarkers.HBA|BrainRegionMarkers.HBA.globalMarker(top200)|Brain region markers|Brain region marker enriched gene sets 4.00e-06 0e+00 2.809155 1695 168 51
GO:0007268 chemical synaptic transmission GO|GO.BP 4.10e-06 0e+00 1.918037 1695 550 114
GO:0098916 anterograde trans-synaptic signaling GO|GO.BP 4.10e-06 0e+00 1.918037 1695 550 114
JAMiller.AIBS.000463 Genes bound by SMAD4 in HUMAN A2780 from PMID 21799915 JA Miller at AIBS|Chip-X enrichment analysis (ChEA) 9.70e-06 0e+00 1.415427 1695 2079 318
GO:0043005 neuron projection GO|GO.CC 1.07e-05 0e+00 1.625648 1695 1036 182
GO:0030424 axon GO|GO.CC 1.10e-05 0e+00 1.942358 1695 505 106
GO:0022836 gated channel activity GO|GO.MF 2.52e-05 0e+00 2.420796 1695 237 62
JAMiller.AIBS.000042 CortexWGCNA 15-21 post-conception weeks C16 SPenriched JA Miller at AIBS|Brain|Prenatal brain|Cortex|WGCNA 2.81e-05 0e+00 2.570469 1695 198 55
JAM:002824 Dentate Nucleus_IN_Cerebellar Nucleus JAM|BrainRegionMarkers|BrainRegionMarkers.HBA|BrainRegionMarkers.HBA.localMarker(top200)|Brain region markers|Brain region marker enriched gene sets 5.67e-05 1e-07 2.725012 1695 163 48
GO:0034702 ion channel complex GO|GO.CC 5.72e-05 1e-07 2.437355 1695 224 59
GO:0005216 ion channel activity GO|GO.MF 6.05e-05 1e-07 2.256199 1695 283 69
GO:0022839 ion gated channel activity GO|GO.MF 6.28e-05 1e-07 2.391843 1695 236 61
GO:1902495 transmembrane transporter complex GO|GO.CC 7.84e-05 1e-07 2.361023 1695 243 62
GO:0099240 intrinsic component of synaptic membrane GO|GO.CC 8.40e-05 1e-07 2.764088 1695 154 46



Comparison with original SFARI Genes

SFARI_genes_old = read_csv('./../../../SFARI/Data/SFARI_genes_08-29-2019_w_ensembl_IDs.csv')
SFARI_genes_old = SFARI_genes_old[!duplicated(SFARI_genes_old$ID) & !is.na(SFARI_genes_old$ID),]

old_SFARI_genes_by_module = dataset %>% dplyr::select(ID, Module, MTcor) %>% left_join(SFARI_genes_old, by = 'ID') %>%
                            mutate('hasSFARIscore' = !is.na(`gene-score`)) %>% 
                            group_by(Module, MTcor, hasSFARIscore) %>% summarise(s=n()) %>% 
                            left_join(dataset %>% group_by(Module) %>% summarise(n=n()), by='Module') %>% 
                            mutate(perc=round(s/n*100,2)) %>% filter(hasSFARIscore & Module != 'gray') %>% arrange(desc(perc))

N = sum(old_SFARI_genes_by_module$n)
S = sum(old_SFARI_genes_by_module$s)

old_SFARI_genes_by_module$prob = apply(old_SFARI_genes_by_module, 1, function(x) calc_prob(x, FALSE))
old_SFARI_genes_by_module$adj_prob = p.adjust(old_SFARI_genes_by_module$prob, method = 'bonferroni')


plot_data = SFARI_genes_by_module %>% ungroup %>% dplyr::rename('adj_prob_new' = adj_prob) %>% dplyr::select(Module, adj_prob_new) %>%
            left_join(old_SFARI_genes_by_module %>% ungroup %>% dplyr::rename('adj_prob_old' = adj_prob) %>% dplyr::select(Module, adj_prob_old), by='Module')

ggplotly(plot_data %>% ggplot(aes(adj_prob_old, adj_prob_new)) + geom_point(color = plot_data$Module, alpha = 0.5, aes(id=Module)) + 
         geom_abline(a=0, b=1, color = 'gray', linetype = 'dotted') + theme_minimal() + scale_y_log10() + scale_x_log10() +
         geom_hline(yintercept = 0.05, color = '#cccccc', linetype = 'dotted') + geom_vline(xintercept = 0.05, linetype = 'dotted', color = '#cccccc') +
         xlab('Adjusted Probability with Old SFARI Scores') + ylab('Adjusted Probability with New SFARI Scores') + 
         ggtitle('SFARI Significance by Module with original an new SFARI scores'))
rm(N,S, calc_prob)



Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] BrainDiseaseCollection_1.00             
##  [2] anRichment_1.01-2                       
##  [3] TxDb.Mmusculus.UCSC.mm10.knownGene_3.4.7
##  [4] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2 
##  [5] GenomicFeatures_1.36.4                  
##  [6] GenomicRanges_1.36.1                    
##  [7] GenomeInfoDb_1.20.0                     
##  [8] anRichmentMethods_0.90-1                
##  [9] WGCNA_1.69                              
## [10] fastcluster_1.1.25                      
## [11] dynamicTreeCut_1.63-1                   
## [12] GO.db_3.8.2                             
## [13] AnnotationDbi_1.46.1                    
## [14] IRanges_2.18.3                          
## [15] S4Vectors_0.22.1                        
## [16] Biobase_2.44.0                          
## [17] BiocGenerics_0.30.0                     
## [18] biomaRt_2.40.5                          
## [19] knitr_1.28                              
## [20] doParallel_1.0.15                       
## [21] iterators_1.0.12                        
## [22] foreach_1.5.0                           
## [23] polycor_0.7-10                          
## [24] expss_0.10.2                            
## [25] GGally_1.5.0                            
## [26] gridExtra_2.3                           
## [27] viridis_0.5.1                           
## [28] viridisLite_0.3.0                       
## [29] RColorBrewer_1.1-2                      
## [30] dendextend_1.13.4                       
## [31] plotly_4.9.2                            
## [32] glue_1.3.2                              
## [33] reshape2_1.4.3                          
## [34] forcats_0.5.0                           
## [35] stringr_1.4.0                           
## [36] dplyr_0.8.5                             
## [37] purrr_0.3.3                             
## [38] readr_1.3.1                             
## [39] tidyr_1.0.2                             
## [40] tibble_3.0.0                            
## [41] ggplot2_3.3.0                           
## [42] tidyverse_1.3.0                         
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                backports_1.1.5            
##   [3] Hmisc_4.4-0                 plyr_1.8.6                 
##   [5] lazyeval_0.2.2              splines_3.6.3              
##   [7] crosstalk_1.1.0.1           BiocParallel_1.18.1        
##   [9] digest_0.6.25               htmltools_0.4.0            
##  [11] fansi_0.4.1                 magrittr_1.5               
##  [13] checkmate_2.0.0             memoise_1.1.0              
##  [15] cluster_2.1.0               annotate_1.62.0            
##  [17] Biostrings_2.52.0           modelr_0.1.6               
##  [19] matrixStats_0.56.0          prettyunits_1.1.1          
##  [21] jpeg_0.1-8.1                colorspace_1.4-1           
##  [23] blob_1.2.1                  rvest_0.3.5                
##  [25] haven_2.2.0                 xfun_0.12                  
##  [27] crayon_1.3.4                RCurl_1.98-1.1             
##  [29] jsonlite_1.6.1              genefilter_1.66.0          
##  [31] impute_1.58.0               survival_3.1-12            
##  [33] gtable_0.3.0                zlibbioc_1.30.0            
##  [35] XVector_0.24.0              DelayedArray_0.10.0        
##  [37] scales_1.1.0                DBI_1.1.0                  
##  [39] miniUI_0.1.1.1              Rcpp_1.0.4                 
##  [41] xtable_1.8-4                progress_1.2.2             
##  [43] htmlTable_1.13.3            foreign_0.8-76             
##  [45] bit_1.1-15.2                preprocessCore_1.46.0      
##  [47] Formula_1.2-3               htmlwidgets_1.5.1          
##  [49] httr_1.4.1                  acepack_1.4.1              
##  [51] ellipsis_0.3.0              farver_2.0.3               
##  [53] pkgconfig_2.0.3             reshape_0.8.8              
##  [55] XML_3.99-0.3                nnet_7.3-14                
##  [57] dbplyr_1.4.2                locfit_1.5-9.4             
##  [59] later_1.0.0                 labeling_0.3               
##  [61] tidyselect_1.0.0            rlang_0.4.5                
##  [63] munsell_0.5.0               cellranger_1.1.0           
##  [65] tools_3.6.3                 cli_2.0.2                  
##  [67] generics_0.0.2              RSQLite_2.2.0              
##  [69] broom_0.5.5                 fastmap_1.0.1              
##  [71] evaluate_0.14               yaml_2.2.1                 
##  [73] bit64_0.9-7                 fs_1.4.0                   
##  [75] nlme_3.1-147                mime_0.9                   
##  [77] ggExtra_0.9                 xml2_1.2.5                 
##  [79] compiler_3.6.3              rstudioapi_0.11            
##  [81] curl_4.3                    png_0.1-7                  
##  [83] reprex_0.3.0                geneplotter_1.62.0         
##  [85] stringi_1.4.6               highr_0.8                  
##  [87] lattice_0.20-41             Matrix_1.2-18              
##  [89] vctrs_0.2.4                 pillar_1.4.3               
##  [91] lifecycle_0.2.0             data.table_1.12.8          
##  [93] bitops_1.0-6                httpuv_1.5.2               
##  [95] rtracklayer_1.44.4          R6_2.4.1                   
##  [97] latticeExtra_0.6-29         promises_1.1.0             
##  [99] codetools_0.2-16            assertthat_0.2.1           
## [101] SummarizedExperiment_1.14.1 DESeq2_1.24.0              
## [103] withr_2.1.2                 GenomicAlignments_1.20.1   
## [105] Rsamtools_2.0.3             GenomeInfoDbData_1.2.1     
## [107] mgcv_1.8-31                 hms_0.5.3                  
## [109] grid_3.6.3                  rpart_4.1-15               
## [111] rmarkdown_2.1               shiny_1.4.0.2              
## [113] lubridate_1.7.4             base64enc_0.1-3